import re
from pathlib import Path

import pandas as pd
import plotly.express as px
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
# Data cleaning & filtering
# NOTE(review): `df` is not defined in this file; it is presumably a Spark
# DataFrame created earlier in the session -- confirm before running standalone.

# Keep only rows with a positive salary and pull the two columns needed for
# the plot into a pandas DataFrame.
pdf = (
    df.filter(df["SALARY"] > 0)
    .select("EMPLOYMENT_TYPE_NAME", "SALARY")
    .toPandas()
)

# Strip every run of non-ASCII characters from the employment-type labels.
# None values are passed through unchanged instead of becoming the string "None".
non_ascii_run = re.compile(r"[^\x00-\x7F]+")
pdf["EMPLOYMENT_TYPE_NAME"] = pdf["EMPLOYMENT_TYPE_NAME"].apply(
    lambda x: non_ascii_run.sub("", str(x)) if x is not None else x
)

# Rank employment types by median salary, highest first.
median_salaries = pdf.groupby("EMPLOYMENT_TYPE_NAME")["SALARY"].median()
sorted_employment_types = median_salaries.sort_values(ascending=False).index

# Turn the label column into an ordered categorical that follows that ranking.
pdf["EMPLOYMENT_TYPE_NAME"] = pd.Categorical(
    pdf["EMPLOYMENT_TYPE_NAME"],
    categories=sorted_employment_types,
    ordered=True,
)
# Creating the boxplot
plot_title = "Salary Distribution by Employment Type"
fig_width, fig_height = 850, 500
output_file = "output/Q1.svg"

# Font and axis-frame settings shared by the title and both axes.
bold_font = dict(family="Arial", color="black", weight="bold")
axis_frame = dict(showline=True, linewidth=2, linecolor="black", mirror=True)

# Salary ticks every 50,000 from 0 to 500,000, labelled "0", "50K", ..., "500K".
salary_ticks = list(range(0, 500001, 50000))
salary_labels = ["0"] + [f"{value // 1000}K" for value in salary_ticks[1:]]

fig = px.box(
    pdf,
    x="EMPLOYMENT_TYPE_NAME",
    y="SALARY",
    title=plot_title,
    # NOTE(review): no `color=` column is passed, so presumably only the first
    # colour of this sequence is applied -- confirm whether per-type colours
    # were intended.
    color_discrete_sequence=["#ffb6c1", "#cb1a72ff", "#db7093", "#c71585"],
    boxmode="group",
    points="all",
)

fig.update_layout(
    title=dict(text=plot_title, font=dict(size=30, **bold_font)),
    xaxis=dict(
        title=dict(text="Employment Type", font=dict(size=24, **bold_font)),
        tickangle=0,
        tickfont=dict(size=18, **bold_font),
        showgrid=False,
        # Lay the categories out in the median-salary ranking computed above.
        categoryorder="array",
        categoryarray=sorted_employment_types.tolist(),
        **axis_frame,
    ),
    yaxis=dict(
        title=dict(text="Salary (K $)", font=dict(size=24, **bold_font)),
        tickvals=salary_ticks,
        ticktext=salary_labels,
        tickfont=dict(size=18, **bold_font),
        showgrid=True,
        gridcolor="lightgrey",
        gridwidth=0.5,
        **axis_frame,
    ),
    font=dict(family="Arial", size=16, color="black"),
    boxgap=0.7,
    plot_bgcolor="white",
    paper_bgcolor="white",
    showlegend=False,
    height=fig_height,
    width=fig_width,
)
fig.show()

# Make sure the target directory exists so the export does not fail on a
# fresh checkout where "output/" has not been created yet.
Path(output_file).parent.mkdir(parents=True, exist_ok=True)
fig.write_image(output_file, width=fig_width, height=fig_height, scale=1)